package sample;

import java.io.*;
import java.util.HashSet;
import java.util.Set;

/**
 * Helpers for de-duplicating and comparing large line-oriented text files.
 *
 * <p>The approach is to partition a file into several smaller "bucket" files by
 * {@code hashCode() % splitSize} of each line, so that equal lines always land in
 * the bucket with the same number, and then to process the buckets one at a time
 * with an in-memory {@link Set}.
 */
public class DistinctFileUtil {

    /**
     * Prefix used in the names of the bucket files created by {@link #splitFile}
     * ({@code <index>-<bucket>.txt}). Two calls that share the same parent directory
     * and the same {@code index} write to the same bucket files, so the caller has to
     * change this value between such calls (as {@link #main} does).
     */
    public static Integer index = 1;

    /**
     * Partitions the lines of a file into {@code splitSize} bucket files by hash.
     *
     * <p>Each line is trimmed; lines that are empty after trimming are dropped. The
     * bucket files are created in a {@code test} sub-directory next to the target file
     * and any bucket file left over from a previous run is replaced. I/O failures are
     * reported with a stack trace and the (possibly incomplete) buckets are still returned.
     *
     * @param targetFile path of the file to partition
     * @param splitSize  number of bucket files to create; must be positive
     * @return the bucket files, indexed by bucket number
     * @throws IllegalArgumentException if {@code splitSize} is not positive
     */
    public static File[] splitFile(String targetFile, int splitSize) {
        if (splitSize <= 0) {
            throw new IllegalArgumentException("splitSize must be positive: " + splitSize);
        }
        File file = new File(targetFile);
        // Resolve against the absolute path so a bare file name does not yield a "null" parent.
        File tempFolder = new File(file.getAbsoluteFile().getParentFile(), "test");
        if (!tempFolder.exists()) {
            tempFolder.mkdirs();
        }

        File[] littleFiles = new File[splitSize];
        for (int i = 0; i < splitSize; i++) {
            littleFiles[i] = new File(tempFolder, index + "-" + i + ".txt");
            if (littleFiles[i].exists()) {
                littleFiles[i].delete();
            }
        }

        PrintWriter[] pws = new PrintWriter[splitSize];
        try {
            for (int i = 0; i < splitSize; i++) {
                pws[i] = new PrintWriter(littleFiles[i]);
            }
            try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    line = line.trim();
                    if (!line.isEmpty()) {
                        // Equal strings have equal hash codes, so every copy of a line
                        // ends up in the same bucket file.
                        int bucket = Math.abs(line.hashCode() % splitSize);
                        pws[bucket].println(line);
                    }
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            for (PrintWriter pw : pws) {
                if (pw != null) {
                    pw.close();
                }
            }
        }
        return littleFiles;
    }

    /**
     * De-duplicates each bucket file and appends its distinct lines to one output file.
     *
     * <p>Empty lines are skipped. The order of the lines within a bucket is the
     * iteration order of a {@link HashSet}, i.e. unspecified. The first
     * {@code splitSize} bucket files are deleted afterwards, whether or not the merge
     * succeeded. I/O failures are reported with a stack trace.
     *
     * @param littleFiles      bucket files; must contain at least {@code splitSize} entries
     * @param distinctFilePath path of the output file; an existing file is overwritten
     * @param splitSize        number of bucket files to process
     */
    public static void distinct(File[] littleFiles, String distinctFilePath, int splitSize) {
        File distinctedFile = new File(distinctFilePath);
        try (PrintWriter pw = new PrintWriter(distinctedFile)) {
            Set<String> unicSet = new HashSet<String>();
            for (int i = 0; i < splitSize; i++) {
                if (!littleFiles[i].exists()) {
                    continue;
                }
                System.out.println("开始对小文件：" + littleFiles[i].getName() + "去重");
                // Close each bucket as soon as it has been read instead of keeping
                // every reader open until the whole merge is finished.
                try (BufferedReader reader = new BufferedReader(new FileReader(littleFiles[i]))) {
                    String line;
                    while ((line = reader.readLine()) != null) {
                        if (!line.isEmpty()) {
                            unicSet.add(line);
                        }
                    }
                }
                for (String s : unicSet) {
                    pw.println(s);
                }
                unicSet.clear();
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Remove the temporary bucket files once the merge is over.
            deleteFiles(littleFiles, splitSize);
        }
    }

    /**
     * Writes the lines of the first bucket set that do not occur in the second one.
     *
     * <p>For every bucket number the non-empty lines of {@code littleFiles2} are loaded
     * into a set; then each non-empty line of {@code littleFiles1} that is not in the
     * set is written to the output file. Per bucket, the size of the set and a summary
     * line with the counters are printed to standard output. The first
     * {@code splitSize} files of both arrays are deleted afterwards, whether or not the
     * comparison succeeded. I/O failures are reported with a stack trace.
     *
     * @param littleFiles1     bucket files whose lines are filtered and written out
     * @param littleFiles2     bucket files whose lines are subtracted
     * @param distinctFilePath path of the output file; an existing file is overwritten
     * @param splitSize        number of buckets to process; both arrays must contain
     *                         at least this many entries
     */
    public static void repeat(File[] littleFiles1, File[] littleFiles2, String distinctFilePath, int splitSize) {
        File distinctedFile = new File(distinctFilePath);
        try (PrintWriter pw = new PrintWriter(distinctedFile)) {
            Set<String> unicSet = new HashSet<String>();
            for (int i = 0; i < splitSize; i++) {
                int a = 0; // lines read from littleFiles2[i], empty ones included
                int b = 0; // lines read from littleFiles1[i], empty ones included
                int c = 0; // lines written to the output for this bucket
                if (littleFiles2[i].exists()) {
                    try (BufferedReader reader = new BufferedReader(new FileReader(littleFiles2[i]))) {
                        String line;
                        while ((line = reader.readLine()) != null) {
                            if (!line.isEmpty()) {
                                unicSet.add(line);
                            }
                            a++;
                        }
                    }
                }
                System.out.println(unicSet.size());
                if (littleFiles1[i].exists()) {
                    try (BufferedReader reader = new BufferedReader(new FileReader(littleFiles1[i]))) {
                        String line;
                        while ((line = reader.readLine()) != null) {
                            if (!line.isEmpty()) {
                                if (!unicSet.contains(line)) {
                                    pw.println(line);
                                    c++;
                                } else {
                                    // NOTE(review): the match is consumed, so a second
                                    // occurrence of the same line in littleFiles1 is
                                    // written out. Confirm this is intended.
                                    unicSet.remove(line);
                                }
                            }
                            b++;
                        }
                    }
                }

                unicSet.clear();
                System.out.println("共有" + b + "个：重复的有" + a + "个，去重后" + c + "===" + (b - c));
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Remove the temporary bucket files once the comparison is over.
            deleteFiles(littleFiles1, splitSize);
            deleteFiles(littleFiles2, splitSize);
        }
    }

    /**
     * Counts the lines of a file.
     *
     * <p>A trailing line terminator does not start an additional line, and an empty
     * file has zero lines.
     *
     * @param targetFile path of the file to inspect
     * @return the number of lines, or {@code 0} if the file does not exist or cannot
     *         be read (the failure is reported with a stack trace)
     */
    public static long getLineNumber(String targetFile) {
        File file = new File(targetFile);
        if (!file.exists()) {
            return 0;
        }
        try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
            long lines = 0;
            while (reader.readLine() != null) {
                lines++;
            }
            return lines;
        } catch (IOException e) {
            e.printStackTrace();
            return 0;
        }
    }

    /**
     * Splits two files into buckets and writes the lines of the first one that are
     * missing from the second one.
     *
     * @param args optional: {@code <first file> <second file> <output file> [bucket count]};
     *             the built-in sample paths and a bucket count of 20 are used otherwise
     * @throws IOException declared for compatibility with existing callers
     */
    public static void main(String[] args) throws IOException {
        int splitSize = 20;
        String duo = "E://test/duo.txt";
        String shao = "E://test/shao.txt";
        String zong = "E://test/zong.txt";
        if (args.length >= 3) {
            duo = args[0];
            shao = args[1];
            zong = args[2];
        }
        if (args.length >= 4) {
            splitSize = Integer.parseInt(args[3]);
        }

        File[] files1 = splitFile(duo, splitSize);
        // Use a different file-name prefix so the second split does not overwrite the first.
        index = index + 1;
        File[] files2 = splitFile(shao, splitSize);

        repeat(files1, files2, zong, splitSize);
    }

    /** Deletes the first {@code count} files of the array that still exist. */
    private static void deleteFiles(File[] files, int count) {
        for (int i = 0; i < count; i++) {
            if (files[i].exists()) {
                files[i].delete();
            }
        }
    }
}
